
import os
import psutil
from pathlib import Path
seed = 22

import joblib #保存模型
import numpy as np
import pandas as pd
from numpy import mean
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score
from sklearn.metrics import  make_scorer
from sklearn.metrics import roc_auc_score,roc_curve,auc,accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
import seaborn as sns

import gc
import warnings
warnings.filterwarnings('ignore')

# Show floats with 4 decimal places in pandas output.
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Seaborn/matplotlib setup for Chinese labels.
# NOTE(review): sns.set_theme on the next line largely overrides this sns.set
# call — confirm which style is actually intended.
rc = {'font.sans-serif': 'SimSun',
      'axes.unicode_minus': False}
sns.set(context='notebook', style='ticks', rc=rc)
sns.set_theme(style="white",font='Times New Roman',font_scale=1.4)
config = {
    "font.family": 'serif', # serif font family
    "font.size": 12, # roughly "small four" (xiao-si) print size
    "font.serif": ['SimSun'], # SimSun for Chinese glyphs
    "font.weight":'bold',
    "mathtext.fontset": 'stix', # math font close to Times New Roman
    'axes.unicode_minus': False # render the minus sign correctly
}
plt.rcParams.update(config)



def show_memory_info(hint):
    """Print the current process's resident-unique (USS) memory in MB.

    :param hint: label printed before the memory figure.
    :return: None
    """
    # psutil reports per-process memory; USS is memory unique to this process.
    proc = psutil.Process(os.getpid())
    uss_mb = proc.memory_full_info().uss / 1024. / 1024
    print(f"{hint} memory used: {uss_mb} MB ")


def read_csv(file_name, num_rows):
    """Load up to *num_rows* rows of a CSV file (None means read everything)."""
    frame = pd.read_csv(file_name, nrows=num_rows)
    return frame


# 内存压缩
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that holds their values.

    Integer columns are narrowed to int8/int16/int32/int64 and floats to
    float16/float32/float64 based on the observed min/max per column.
    NOTE: float16 keeps only ~3 significant digits, so a small precision
    loss is possible on float columns.

    Bug fixes vs. the original:
    - the ``verbose`` flag was accepted but ignored; it now gates the prints;
    - comparisons were strict (< / >), so a value sitting exactly on a dtype
      boundary (e.g. min == -128) skipped the narrower dtype; now inclusive.

    :param df: DataFrame, mutated in place (and returned for convenience).
    :param verbose: print memory usage before/after when True.
    :return: the same DataFrame.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
        else:
            if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Memory usage before optimization is: {:.2f} MB'.format(start_mem))
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df


# 加载数据
def load(file_path):
    """Read the CSV at *file_path* in full and shrink its memory footprint.

    :param file_path: path to the CSV file.
    :return: memory-optimized DataFrame.
    """
    # None -> no row limit; the whole file is loaded.
    frame = read_csv(file_path, None)
    return reduce_mem_usage(frame)


#训练集测试集切分
def train_test_sp(data, test_size):
    """Stratified train/test split with 'is_5g' as the target.

    Uses the module-level ``seed`` for reproducibility.

    :param data: DataFrame containing an 'is_5g' column.
    :param test_size: fraction of rows assigned to the test set.
    :return: X_train, X_test, y_train, y_test
    """
    feature_cols = [c for c in data.columns if c not in ['is_5g']]
    features = data[feature_cols]
    target = data['is_5g']
    return train_test_split(features, target, random_state=seed,
                            test_size=test_size, stratify=target)

#将数据中空值、无穷替换为0
def drop_na(data):
    """Replace ±inf and missing values with 0 in place; return the frame."""
    # Infinities become 0 directly, then any remaining NaN is zero-filled —
    # same end state as the original inf->NaN->0 two-step.
    data.replace([np.inf, -np.inf], 0, inplace=True)
    data.fillna(0, inplace=True)
    return data



def df_to_csv(df_Str, df):
    """Write *df* to ./output/<df_Str>.csv, creating the folder if needed.

    :param df_Str: file name stem (no extension).
    :param df: DataFrame to persist (index is not written).
    """
    target = Path(r'./output/' + df_Str + '.csv')
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target, index=False)

#查看离散变量取值
def show_vl_cnt(train):
    """Print the max and value counts of every discrete column.

    :param train: DataFrame containing all listed discrete columns.
    :return: the list of discrete column names (for reuse by callers).
    """
    int_cols = ['prov_id', 'chnl_type', 'service_type', 'product_type',
                'innet_months', 'sex', 'age', 'manu_name', 'term_type',
                'max_rat_flag', 'is_5g_base_cover', 'is_work_5g_cover',
                'is_home_5g_cover', 'is_work_5g_cover_l01', 'is_home_5g_cover_l01',
                'is_work_5g_cover_l02', 'is_home_5g_cover_l02', 'activity_type',
                'comp_type', 'call_days', 're_call10', 'short_call10',
                'long_call10', 'active_days01', 'active_days02', 'active_days03',
                'active_days04', 'active_days05', 'active_days06', 'active_days07',
                'active_days08', 'active_days09', 'active_days10', 'city_level']
    for name in int_cols:
        print(train[[name]].max())
        print(train[[name]].agg(['value_counts']).T)
    return int_cols


def to_int(df_train, df_test):
    """Manually downcast known columns of both frames to compact dtypes.

    Low-cardinality categoricals go to int8, day/call counters to int16 and
    the continuous usage columns to plain float.  The train-only target
    'is_5g' is also stored as int8.  Both frames are mutated and returned.
    """
    int8_cols = ['prov_id',
                 'chnl_type',
                 'service_type',
                 'product_type',
                 'sex',
                 'age',
                 'manu_name',
                 'term_type',
                 'max_rat_flag',
                 'is_5g_base_cover',
                 'is_work_5g_cover',
                 'is_home_5g_cover',
                 'is_work_5g_cover_l01',
                 'is_home_5g_cover_l01',
                 'is_work_5g_cover_l02',
                 'is_home_5g_cover_l02',
                 'activity_type',
                 'comp_type',
                 'active_days01',
                 'active_days02',
                 'active_days03',
                 'active_days04',
                 'active_days05',
                 'active_days06',
                 'active_days07',
                 'active_days08',
                 'active_days09',
                 'active_days10',
                 'city_level']
    int16_cols = ['innet_months', 'call_days', 're_call10', 'short_call10', 'long_call10']
    # continuous usage/fee columns
    float_cols = ['total_times', 'total_flux', 'total_fee', 'pay_fee', 'game_app_flux', 'live_app_flux',
                  'video_app_flux', 'city_5g_ratio']

    for frame in (df_train, df_test):
        frame[int8_cols] = frame[int8_cols].astype(np.int8)
        frame[int16_cols] = frame[int16_cols].astype(np.int16)
        frame[float_cols] = frame[float_cols].astype('float')
    df_train.is_5g = df_train.is_5g.astype(np.int8)

    return df_train, df_test


# 饼状图
def func(pct, count):
    """Format an autopct percentage for a pie wedge: two decimals, '%', newline.

    :param pct: slice percentage supplied by matplotlib's autopct hook.
    :param count: raw counts; kept for interface compatibility — the absolute
        value reconstructed from it in the original was never used (dead code,
        now removed).
    :return: formatted percentage string.
    """
    return "{:.2f}%\n".format(pct)


def pie1(data):
	"""Draw a donut chart of 5G vs non-5G user counts and save it as a JPG.

	Saves to ./images/5G用户占比.jpg and shows the figure.

	:param data: sequence of two counts: (non-5G users, 5G users).
	"""
	fig, ax = plt.subplots(figsize=(8, 8))
	labels = ['非5G用户', '5G用户']  # legend labels: "non-5G users", "5G users"
	# width=0.5 makes a donut; `func` renders the percentage text per wedge
	ax.pie(data, autopct=lambda pct: func(pct, data),
		   wedgeprops={'linewidth': 1, 'edgecolor': 'black', 'alpha': 0.5, 'width': 0.5},
		   # labels=labels,
		   labeldistance=1.1,
		   shadow=False, textprops={'fontsize': 16}, radius=0.9,
		   rotatelabels=True,
		   # startangle=90,
		   # explode = (0,0.1)
		   )
	ax.legend(labels, loc=1, fontsize=13)
	# ax.set_xlabel('5G用户占比',fontsize = 16)
	plt.savefig('./images/5G用户占比.jpg')
	plt.show()

#核密度图
def plot_kde(X_train, X_test, int_cols):
    """Overlay train (red) and test (blue) kernel-density plots per column.

    Bug fix: the loop previously iterated the module-level ``df_train``
    instead of the ``X_train`` argument, so the parameters were partly
    ignored and the function crashed when no global ``df_train`` existed.

    :param X_train: training feature frame.
    :param X_test: test feature frame.
    :param int_cols: columns to plot (max 36 fit the 6x6 grid).
    """
    plt.figure(figsize=(60, 60), dpi=75)
    cols = X_train[int_cols].columns
    for i, column in zip(range(len(cols)), cols):
        plt.subplot(6, 6, i + 1)
        g = sns.kdeplot(X_train[column], color="Red", shade=True)
        g = sns.kdeplot(X_test[column], ax=g, color="Blue", shade=True)
        g.set_xlabel(column)
        g.set_ylabel("Frequency")
        g.legend(["train", "test"])
    plt.savefig('./训练测试样本kde.jpg')
    plt.show()


# 计算类别变量 与 5g关系  解决 空类别变量
def type_per(col_name, keys, counts):
	"""For each category value of *col_name*, return the share of 5G users.

	NOTE(review): reads the module-level ``df_train`` instead of taking the
	frame as a parameter — ``df_train`` must exist globally when called.

	:param col_name: column whose category values are inspected.
	:param keys: category values (e.g. ``value_counts().keys()``).
	:param counts: total occurrence count for each entry of *keys*.
	:return: list of per-category ratios of 5G users (is_5g == 1).
	"""
	per1 = []
	for col, val in zip(keys, counts):
		# count_type =np.sum(df_train[df_train['is_5g']==1][col_name]==col)
		# number of 5G users having this category value
		count_type = np.sum(df_train[df_train['is_5g'] == 1][col_name] == col)
		per1.append(count_type / val)
	return per1

def lisan_plot(df_train,X_train):
	"""Bar-plot, per categorical column, the 5G-user share of each category.

	Saves to ./训练集离散变量正负样本的分布情况.jpg.
	NOTE(review): relies on ``type_per``, which itself reads the module-level
	``df_train`` — the parameter here shadows that global only inside this body.

	:param df_train: training frame used for column ordering.
	:param X_train: feature frame used for category counts.
	"""
	class_col = ['chnl_type', 'service_type',
				 'product_type', 'sex', 'age', 'manu_name', 'term_type',
				 'max_rat_flag', 'is_5g_base_cover', 'is_work_5g_cover',
				 'is_home_5g_cover', 'is_work_5g_cover_l01', 'is_home_5g_cover_l01',
				 'is_work_5g_cover_l02', 'is_home_5g_cover_l02', 'activity_type',
				 'comp_type', 'city_level']
	# inspect how each categorical variable relates to the 5G target

	plt.figure(figsize=(30, 20), dpi=200)

	for i, column in zip(range(len(class_col)), df_train[class_col].columns):
		plt.subplot(3, 6, i + 1)
		x = X_train[column].value_counts()
		# share of 5G users within each category value
		per_5g = type_per(column, x.keys(), x.values)
		# print(per_5g)
		plt.bar(x.keys(), height=per_5g, width=0.4, alpha=0.6)
		plt.title(column + '的正负样本的分布情况')
	plt.savefig('./训练集离散变量正负样本的分布情况.jpg')
	plt.show()

def lianxu_plot(X_train_y):
	"""KDE plots of each continuous column split by the 5G target.

	Red = 5G users, blue = non-5G users.  Saves to
	./训练集连续变量正负样本分布情况.jpg.

	:param X_train_y: training features together with an 'is_5g' column.
	"""
	float_col = ['total_times', 'total_flux', 'total_fee', 'pay_fee', 'game_app_flux', 'live_app_flux',
				 'video_app_flux', 'city_5g_ratio',
				 'innet_months', 'call_days', 're_call10', 'short_call10', 'long_call10',
				 'active_days01', 'active_days02', 'active_days03', 'active_days04',
				 'active_days05', 'active_days06', 'active_days07', 'active_days08',
				 'active_days09', 'active_days10']

	fig = plt.figure(figsize=(50, 40), dpi=75)
	for i, column in zip(range(len(float_col)), float_col):
		plt.subplot(6, 6, i + 1)
		g = sns.kdeplot(X_train_y[X_train_y.is_5g == 1][column], color="Red", shade=True)
		g1 = sns.kdeplot(X_train_y[X_train_y.is_5g != 1][column], ax=g, color="Blue", shade=True)
		# g.set_xlabel(column)
		# g.set_ylabel("Frequency")
		g = g.legend(["5G用户", "非5G用户"])
		plt.title(column + '的正负样本分布情况')
	plt.savefig('./训练集连续变量正负样本分布情况.jpg')
	plt.show()

def yichang_xiangxian(data1, label):
	"""Box-plot the continuous columns to eyeball outliers.

	Saves to ./im异常检测箱线图<label>.jpg.

	:param data1: DataFrame holding the continuous columns below.
	:param label: suffix appended to the saved file name (e.g. before/after).
	"""
	# continuous variables inspected via box plots
	num_cols = ['total_times', 'total_flux', 'total_fee', 'pay_fee', 'game_app_flux',
				'live_app_flux', 'video_app_flux', 'innet_months',
				'call_days', 're_call10', 'short_call10', 'long_call10']

	plt.figure(figsize=(40, 15))
	for i in range(len(num_cols)):
		plt.subplot(2, 6, i + 1)
		sns.boxplot(data=data1[num_cols[i]], orient="v", width=0.5)
		plt.xlabel(num_cols[i], fontsize=30)
	plt.savefig('./im异常检测箱线图' + label + '.jpg')
	plt.show()

#删除异常值 1.2倍
def del_yichang(train, test):
    """Drop train rows exceeding 1.2x the test-set maximum, column by column.

    Mutates *train* in place and returns None (same contract as before).

    Bug fix: the original iterated positional counters from
    ``range(len(train[col]))`` and passed them to ``DataFrame.drop`` as if
    they were index labels — once a row was dropped (or the index was not the
    default RangeIndex) this removed the wrong rows or raised KeyError.
    Rows are now selected by a boolean mask and dropped by their real labels.

    :param train: frame to clean in place.
    :param test: frame providing the per-column reference maximum.
    """
    num_cols = ['total_times', 'total_flux', 'total_fee', 'pay_fee', 'game_app_flux',
                'live_app_flux', 'video_app_flux', 'innet_months',
                'call_days', 're_call10', 'short_call10', 'long_call10']
    for col in num_cols:
        cap = 1.2 * test[col].max()
        outliers = train.index[train[col] > cap]
        train.drop(outliers, axis=0, inplace=True)

def lianxu_his_plot():
	"""Histogram each continuous column to visualize skewness.

	NOTE(review): reads the module-level ``df_train`` — it must be loaded
	before calling.  Saves to ./连续变量偏态分布.png.
	"""
	num_cols = ['total_times', 'total_flux', 'total_fee', 'pay_fee', 'game_app_flux',
				'live_app_flux', 'video_app_flux', 'innet_months',
				'call_days', 're_call10', 'short_call10', 'long_call10']
	plt.figure(figsize=(40, 15))
	for i in range(len(num_cols)):
		plt.subplot(2, 6, i + 1)
		sns.histplot(x=df_train[num_cols[i]], bins=60, edgecolor="black", alpha=0.7)
		plt.xlabel(num_cols[i], fontsize=30)
	plt.savefig('./连续变量偏态分布.png')
	#plt.title('连续变量分布直方图')
	plt.show()

def corr_map(train, title, plot_save=True):
    """Draw a lower-triangle Spearman correlation heatmap of *train*.

    Bug fix: ``np.bool`` was deprecated in NumPy 1.20 and removed in 1.24;
    the builtin ``bool`` is the documented replacement.

    NOTE(review): temporarily switches matplotlib rcParams to a smaller font
    and restores the module-level ``config`` before returning.

    :param train: DataFrame of numeric columns.
    :param title: saved image name (under ./images/) when *plot_save* is True.
    :param plot_save: persist the figure to disk when True.
    """
    config1 = {
        "font.family": 'serif',          # serif font family
        "font.size": 9,                  # smaller font for the big matrix
        "font.serif": ['SimSun'],        # SimSun for Chinese glyphs
        "mathtext.fontset": 'stix',      # math font close to Times New Roman
        'axes.unicode_minus': False      # render the minus sign correctly
    }
    plt.rcParams.update(config1)

    plt.figure(figsize=(25, 25))
    # pairwise Spearman rank correlations
    mcorr = train.corr(method="spearman")
    mask = np.zeros_like(mcorr, dtype=bool)  # was np.bool (removed in NumPy 1.24)
    mask[np.triu_indices_from(mask)] = True  # hide the upper triangle
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    g = sns.heatmap(data=mcorr, mask=mask, cmap=cmap, square=True,
                    #annot=True,
                    #fmt='0.2f',
                    cbar=True, cbar_kws={"shrink": 0.6})
    if plot_save:
        plt.savefig('./images/' + title + '.jpg')
    plt.show()
    plt.rcParams.update(config)  # restore the module-wide rc settings

#特征构造
def gouzao(train, total=700000):
    """Feature engineering on *train* (mutated in place).

    Adds: 5g_cover (weighted coverage score), avg_* daily call-rate columns,
    innet_months_box (tenure bucket), manu_name_ratio (brand share),
    app_flux (correlation-weighted app traffic) and app_num (sum of the ten
    active-day counters).

    Bug fix: several expressions referenced the module-level ``df_train``
    instead of the ``train`` argument; they now consistently use ``train``.
    Generalization: the brand-share denominator, previously hard-coded to
    700000, is now the ``total`` parameter (same default, so existing callers
    are unaffected).

    :param train: training frame; must contain the raw columns used below,
        including the 'is_5g' target (used for correlation weights).
    :param total: denominator for the brand-share feature.
    """
    # weighted "is the user's area 5G-covered" score
    train['5g_cover'] = 0.4 * train['is_5g_base_cover'] + 0.3 * train['is_work_5g_cover'] + 0.3 * train[
        'is_home_5g_cover']

    # average calls per active day: total / long / short / redial
    train['avg_call_cnt'] = ((train['long_call10'] + train['short_call10'] + train['re_call10'])
                             / train['call_days']).astype(np.float16)
    train['avg_long_call_cnt'] = (train['long_call10'] / train['call_days']).astype(np.float16)
    train['avg_short_call_cnt'] = (train['short_call10'] / train['call_days']).astype(np.float16)
    train['avg_recall_cnt'] = (train['re_call10'] / train['call_days']).astype(np.float16)

    # bucket tenure (months) into 4 ordinal bins: <=12, <=24, <=48, >48
    bins = [0, 12, 24, 48, 1000]
    score_cat = pd.cut(train['innet_months'], bins, labels=['1', '2', '3', '4'])
    train['innet_months_box'] = score_cat.values
    train['innet_months_box'] = train['innet_months_box'].astype(np.int8)

    # handset-brand share of the whole sample
    manu_name_ratio = dict(
        zip(train['manu_name'].value_counts().keys(), train['manu_name'].value_counts().values / total))
    train['manu_name_ratio'] = train['manu_name'].map(manu_name_ratio)

    # correlation-weighted app traffic.
    # NOTE(review): only the video term is divided by its max — this looks
    # like an operator-precedence slip in the original formula, but it is
    # preserved as-is; confirm the intended normalization.
    train['app_flux'] = train['live_app_flux'].corr(train.is_5g) * 10 * train['live_app_flux'] + \
                        train['game_app_flux'].corr(train.is_5g) * 10 * train['game_app_flux'] + \
                        train['video_app_flux'].corr(train.is_5g) * 10 * train['video_app_flux'] / max(
        train['video_app_flux'])

    # total active days across the ten tracked apps
    train['app_num'] = sum(train['active_days%02d' % k] for k in range(1, 11))


# 套餐中网络类型占比
def zhanbi(x, col):
    """Return the fraction of non-null entries in *x*.

    Bug fix: the original called ``np.count``, which does not exist in NumPy
    and raised AttributeError on every call.  The non-null count now mirrors
    pandas' ``Series.count`` semantics.

    :param x: array-like / Series whose non-null share is measured.
    :param col: unused; kept only for interface compatibility with callers.
    :return: non-null count divided by total length.
    """
    return np.count_nonzero(~pd.isnull(x)) / len(x)

def col_cnt(dict1, grp1):
    """Share of each (level-1, level-2) group within its level-1 total.

    :param dict1: mapping level-1 key -> total count at that level.
    :param grp1: a two-key DataFrame groupby object.
    :return: dict keyed by the two group labels concatenated as strings.
    """
    result = dict()
    for name, group in grp1:
        key = str(name[0]) + str(name[1])
        result[key] = group.agg('count')[0] / dict1.get(name[0])
    return result

def concat_str(data1, data2):
    """Element-wise string concatenation of two same-length sequences.

    :return: list of str(a) + str(b) pairs, truncated to the shorter input.
    """
    return [str(a) + str(b) for a, b in zip(data1, data2)]


# 计算二级索引 在一级索引中的数量和 占比
def index2_ratio(dict1, grp1):
	cnt_dict = dict()
	ratio_dict = dict()
	for name, group in grp1:
		# print(name)
		# 将group 中的两级索引 name 分别取出并拼在一起 并将当前group 使用count计数  最后除以对应一级索引统计数
		cnt_dict[str(name[0]) + str(name[1])] = group.agg('count')[0]
		ratio_dict[str(name[0]) + str(name[1])] = group.agg('count')[0] / dict1.get(name[0])
	return cnt_dict, ratio_dict


def pca_gxl(data, title, rate=0.99):
    """Plot the PCA cumulative explained-variance curve and mark where it
    first reaches *rate*.

    Bug fixes: the original plotted/printed the leftover loop variable ``i``
    (wrong whenever the threshold was never reached, leaving ``index_cum``
    as ``[]``), and carried an unused ``length`` local.  The component index
    is now computed explicitly, falling back to the last component.

    :param data: feature frame; standardized before fitting PCA.
    :param title: y-axis label and saved image name (under ./images/).
    :param rate: cumulative explained-variance threshold in (0, 1].
    :return: the fitted PCA object.
    """
    std_data = StandardScaler().fit_transform(data)
    pca = PCA().fit(std_data)
    plt.figure(figsize=[12, 6])
    sort_cum = np.cumsum(pca.explained_variance_ratio_)
    # first component index where the cumulative ratio reaches the threshold
    hits = np.flatnonzero(sort_cum >= rate)
    index_cum = int(hits[0]) if hits.size else len(sort_cum) - 1
    num = int(rate * 100)
    plt.plot(sort_cum, marker='+')
    # vertical marker at the selected component count
    plt.plot([index_cum] * num, np.arange(0, rate, 0.01), 'r')
    print(index_cum)
    plt.xlabel("特征数量")
    plt.ylabel(title)
    plt.grid()
    plt.savefig('./images/' + title + '.jpg')
    plt.show()
    return pca

def drop_(data):
    """Drop leakage/ID/intermediate columns from *data* if they are present.

    Mutates *data* in place and returns it.
    """
    drop_col = ['product_type_service_type',
                'chnl_type_service_type', 'prov_id_product_type', 'prov_id_service_type',
                'manu_name_term_type', 'active_days01', 'active_days02', 'active_days03', 'active_days04',
                'active_days05', 'active_days06', 'active_days07', 'active_days08',
                'active_days09', 'active_days10',
                'activity_type_comp_type',
                'is_5g', 'area_id', 'user_id', 'video_app_flux', 'live_app_flux', 'manu_name_ratio'
                ]
    for col in drop_col:
        if col in data.columns:
            data.drop(col, axis=1, inplace=True)
    return data

def select_by_lgb(train_data,train_label,random_state=20,n_splits=5,metric='auc',num_round=10000,early_stopping_rounds=200):
    """K-fold LightGBM training used to score feature importance.

    Trains one booster per fold with early stopping and collects per-fold
    feature importances, plus their mean in the 'averge' column (typo kept —
    downstream code may index the returned frame by that exact name).

    NOTE(review): ``lgb.train(..., early_stopping_rounds=..., verbose_eval=...)``
    was removed in LightGBM 4.x in favor of callbacks — confirm the pinned
    LightGBM version before running.

    :param train_data: feature DataFrame.
    :param train_label: target Series aligned with *train_data*.
    :param random_state: KFold seed (incremented each fold).
    :param n_splits: number of CV folds.
    :param metric: LightGBM eval metric name.
    :param num_round: maximum boosting rounds.
    :param early_stopping_rounds: patience for early stopping.
    :return: DataFrame with 'feature', per-fold importances and their mean.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = train_data.columns
    fold=0
    print(train_data.columns)
    print(train_data.info())
    for train_idx, val_idx in kfold.split(train_data):
        random_state+=1
        train_x = train_data.iloc[train_idx]
        train_y =train_label.iloc[train_idx]
        test_x =train_data.iloc[val_idx]
        test_y =  train_label.iloc[val_idx]
        clf=lgb
        train_matrix = clf.Dataset(train_x,label=train_y)
        test_matrix=clf.Dataset(test_x,label=test_y)
        params={
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'learning_rate': 0.1,
                'metric': metric,
                'seed': seed,
                'nthread':-1 }
        model=clf.train(params,train_matrix,num_round,valid_sets=test_matrix,early_stopping_rounds=early_stopping_rounds,verbose_eval= 1)
        feature_importances['fold_{}'.format(fold + 1)] = model.feature_importance()
        #print(feature_importances)
        fold+=1
    feature_importances['averge']=feature_importances[['fold_{}'.format(i) for i in range(1,n_splits+1)]].mean(axis=1)
    return feature_importances

def final_cols(df_train):
    """Restrict *df_train* to the selected feature set and split train/test.

    Bug fix: the original built ``final_train = df_train[fina_col]`` but then
    split the full ``df_train``, so the feature selection had no effect.  The
    split now uses the selected columns plus the 'is_5g' target that
    ``train_test_sp`` requires.

    :param df_train: frame containing all selected features and 'is_5g'.
    :return: X_train, X_test, y_train, y_test (stratified 80/20 split).
    """
    # fina_col = feature_importances['feature'][:30].values
    fina_col = [
        'activity_type',
        'age',
        'call_days',
        'chnl_type',
        'city_5g_ratio',
        'city_level',
        'comp_type',
        'innet_months',
        'long_call10',
        'manu_name',
        'pay_fee',
        'product_type',
        'prov_id',
        'is_5g_base_cover',
        'sex',
        'short_call10',
        'term_type',
        'total_fee',
        'total_flux',
        'total_times',
        'app_flux',
        'app_num',
        'avg_call_cnt',
        'product_type_service_type_ratio',
        'prov_id_product_type_cnt',
        'prov_id_product_type_ratio',
        'prov_id_service_type_cnt',
        'prov_id_service_type_ratio']

    # keep the target alongside the selected features for the split helper
    final_train = df_train[fina_col + ['is_5g']]
    X_train, X_test, y_train, y_test = train_test_sp(final_train, 0.2)
    return X_train, X_test, y_train, y_test



# 网格搜索 欠采样 过采样参数  决策树基模型
def caiyang1(k_values, st,X_train, y_train):
	"""Grid-search SMOTE k_neighbors and undersampling ratios on a tree model.

	For every (k, strategy) pair, evaluates two pipelines with repeated
	stratified CV: SMOTE + random undersampling vs SMOTE only, and prints
	the mean ROC AUC of each.

	:param k_values: candidate SMOTE ``k_neighbors`` values.
	:param st: candidate ``sampling_strategy`` ratios for the undersampler.
	:param X_train: training features.
	:param y_train: training target.
	"""
	for k in k_values:
		# define pipeline: oversample minority to 20% before tuning the rest
		model = DecisionTreeClassifier()
		over = SMOTE(sampling_strategy=0.2, k_neighbors=k, random_state=seed)
		for j in st:
			under = RandomUnderSampler(sampling_strategy=j, random_state=seed)
			# under = RandomUnderSampler(sampling_strategy=0.5,random_state=seed)
			steps1 = [('over', over), ('under', under), ('model', model)]
			steps2 = [('over', over), ('model', model)]
			pipeline1 = Pipeline(steps=steps1)
			pipeline2 = Pipeline(steps=steps2)
			# evaluate pipeline
			cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
			scores1 = cross_val_score(pipeline1, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
			score1 = scores1.mean()
			scores2 = cross_val_score(pipeline2, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
			score2 = scores2.mean()

			print('欠采样：k=%d,sampling_strategy=%.2f Mean ROC AUC: %.3f' % (k, j, score1))
			print('未欠采样：k=%d,sampling_strategy=%.2f Mean ROC AUC: %.3f' % (k, j, score2))

# 选择最优的参数进行综合采样
def caiyang3(X_train, y_train):
    """Resample with the tuned parameters: SMOTE then random undersampling.

    First oversamples the minority class to a 0.1 ratio with SMOTE
    (k_neighbors=30), then randomly undersamples the majority class down to
    a 0.6 minority/majority ratio.

    :return: (X_resampled, y_resampled)
    """
    smote = SMOTE(sampling_strategy=0.1, k_neighbors=30, random_state=seed)
    X_over, y_over = smote.fit_resample(X_train, y_train)

    under = RandomUnderSampler(sampling_strategy=0.6, random_state=seed)
    return under.fit_resample(X_over, y_over)


# AUC 曲线
def plot_AUC(model,X_test,y_test,name):
    """Plot the ROC curve of *model* on the test set and save it as a JPG.

    :param model: fitted classifier exposing ``predict_proba``.
    :param X_test: test features.
    :param y_test: test labels.
    :param name: prefix for the saved file under ./images/.
    """
    probs = model.predict_proba(X_test)
    # probability of the positive class (column 1)
    preds = probs[:,1]
    fpr, tpr, threshold = roc_curve(y_test, preds)
    roc_auc = auc(fpr, tpr)

    #plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # diagonal = random-guess baseline
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.savefig('./images/'+name+'模型ROC曲线.jpg',dpi=300,bbox_inches='tight')
    plt.show()


def plot_Matrix(cm, names, plot, classes,title = None):
	"""Render a confusion matrix as an annotated heatmap.

	:param cm: confusion matrix (2-D integer array).
	:param names: prefix for the saved file under ./images/.
	:param plot: save the figure to disk when truthy.
	:param classes: unused — kept for interface compatibility.
	:param title: unused — kept for interface compatibility.
	"""
	fig, ax = plt.subplots(ncols=1, figsize=(5, 5))
	sns.heatmap(cm,
				annot=True, ax=ax, fmt='d',
				linewidths=.2, linecolor="Darkblue", cmap="Blues", annot_kws={'size': 15})

	fig.tight_layout()
	if plot:
		plt.savefig('./images/' + names + '最优模型预测结果混淆矩阵' + '.jpg', dpi=300, bbox_inches='tight')
	plt.show()


# thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
def plot_all(predictions, model_name, plot):
	"""Threshold predicted probabilities, plot confusion matrices and print metrics.

	NOTE(review): reads the module-level ``X_test`` and ``y_test`` rather than
	taking them as parameters, and the local ``auc`` shadows the imported
	``sklearn.metrics.auc`` inside this function.  The returned metrics come
	from the LAST threshold in the list (currently only 0.5).

	:param predictions: predict_proba output, positive class in column 1.
	:param model_name: label used for the saved confusion-matrix image.
	:param plot: forwarded to plot_Matrix; save the figure when truthy.
	:return: (accuracy, recall, auc) at the last threshold.
	"""
	thresholds = [0.5]

	j = 1
	for threshold in thresholds:
		predictions_new = np.zeros([len(X_test), 1])
		predictions_new[predictions[:, 1] > threshold] = 1
		# confusion matrix at this threshold
		conf = confusion_matrix(y_test, predictions_new)
		# plot it
		plot_Matrix(conf, model_name, plot, classes=[0, 1], title='threshod is {}'.format(threshold))
		# recall of the positive class: TP / (FN + TP)
		recall = conf[1, 1] / (conf[1, 0] + conf[1, 1])
		j = j + 1
		acc = accuracy_score(y_test, predictions_new)
		auc = roc_auc_score(y_test, predictions_new)

		print('threshold:' + str(threshold))
		print(classification_report(y_test, predictions_new))
		print('recall :' + str(recall))
		print('accuracy:' + str(acc))
		print('auc:' + str(auc))
	return acc, recall, auc


def plt_importances(model, name):
	"""Horizontal bar chart of normalized feature importances, saved as a JPG.

	NOTE(review): takes feature names from the module-level ``X_train_under``
	— its columns must match the features the model was trained on.

	:param model: fitted estimator exposing ``feature_importances_``.
	:param name: prefix for the saved file under ./images/.
	"""
	# raw importances, normalized to sum to 1
	importances = model.feature_importances_
	std_importances = importances / sum(importances)
	# feature names (from the global resampled training frame)
	feat_names = X_train_under.columns
	# sort descending by importance
	indices = np.argsort(importances)[::-1]
	# plot
	plt.figure(figsize=(10, 12))
	plt.barh(range(len(indices)), std_importances[indices], color='lightblue', align="center")
	plt.yticks(range(len(indices)), feat_names[indices], fontsize=14)
	plt.savefig('./images/' + name + '累计特征重要性.jpg', dpi=600, bbox_inches='tight')
	plt.show()

def train_dtree(x, y):
    """Grid-search a DecisionTree pipeline over depth/split/leaf/class-weight.

    NOTE(review): relies on the module-level ``my_roc`` (scorer) and ``cv``
    (fold iterator), which are not defined in this file — confirm they are
    created elsewhere before calling.

    Bug fix: the best-parameter print used ``'t%s'`` which emitted a literal
    't' instead of a tab; it is now ``'\\t%s'``.

    :param x: training features.
    :param y: training target.
    :return: the fitted GridSearchCV object.
    """
    pipeline = Pipeline([
        ('clf', DecisionTreeClassifier(criterion='gini', random_state=seed))
    ])
    parameters = {
        'clf__max_depth': (5, 10, 20, 40),
        'clf__min_samples_split': (2, 3, 5),
        'clf__min_samples_leaf': (2, 5, 10),
        'clf__class_weight': (None, 'balanced')
    }
    # GridSearchCV exhaustively tries every parameter combination with CV.
    dt_grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring=my_roc, cv=cv)
    dt_grid_search.fit(x, y)
    # report the best configuration found
    best_parameters = dt_grid_search.best_estimator_.get_params()
    print("最好的roc值为：", dt_grid_search.best_score_)
    print('最好的参数为：')
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    return dt_grid_search
    # Recorded result of a previous run:
    #   Fitting 5 folds for each of 90 candidates, totalling 450 fits
    #   best roc: 0.8877463181546614
    #   clf__class_weight: 'balanced', clf__max_depth: 20,
    #   clf__min_samples_leaf: 1, clf__min_samples_split: 2


def rf_cv1(X_train_under,y_train_under):
	"""Coarse search over RandomForest n_estimators (1, 11, ..., 191).

	Scores each candidate with 5-fold CV accuracy, prints and plots the
	curve, and returns the best n_estimators value.

	:param X_train_under: resampled training features.
	:param y_train_under: resampled training target.
	:return: n_estimators value with the highest mean CV score.
	"""
	scorel = []
	for i in range(0, 200, 10):
		rfc = RandomForestClassifier(n_estimators=i + 1, n_jobs=-1, random_state=90)
		score = cross_val_score(rfc, X_train_under, y_train_under, cv=5).mean()
		scorel.append(score)
	# position of the best score maps back to n_estimators = index*10 + 1
	print(max(scorel), '使分数最高的n_estimators值为：', scorel.index(max(scorel)) * 10 + 1)
	plt.figure()
	plt.plot(range(1, 201, 10), scorel)
	plt.show()
	return scorel.index(max(scorel)) * 10 + 1

def rf_cv2(X_train_under, y_train_under, n_estimators):
    """Grid-search RandomForest max_depth with n_estimators fixed.

    Bug fix: ``GridSearchCV.best_params_`` is a plain dict, so the original
    attribute access (``best_params_.max_depth``) raised AttributeError; it
    is now a dict lookup.

    :param n_estimators: tree count chosen by rf_cv1.
    :return: (fitted GridSearchCV, best max_depth)
    """
    param_test2 = {'max_depth': [1, 3, 5, 7, 9, 11]}
    gsearch2 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=n_estimators, min_samples_split=100,
                                                             min_samples_leaf=20, max_features='sqrt', oob_score=True,
                                                             random_state=10),
                            param_grid=param_test2, scoring='roc_auc', cv=5)
    gsearch2.fit(X_train_under, y_train_under)
    print(gsearch2.best_params_, gsearch2.best_score_)
    # e.g. {'max_depth': 11} 0.9476268748394926
    return gsearch2, gsearch2.best_params_['max_depth']


def rf_cv3(X_train_under, y_train_under, n_estimators, max_depth):
    """Grid-search min_samples_split and min_samples_leaf together.

    Bug fix: ``best_params_`` is a dict — the original attribute accesses
    raised AttributeError; dict lookups are used now.  (The pointless
    ``n_estimators = n_estimators`` self-assignments were removed.)

    :return: (fitted GridSearchCV, best min_samples_split, best min_samples_leaf)
    """
    param_test3 = {'min_samples_split': [10, 40, 80, 100], 'min_samples_leaf': [10, 40, 60, 80]}
    gsearch3 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                                             max_features='sqrt', oob_score=True, random_state=seed),
                            param_grid=param_test3, scoring='roc_auc', cv=5)
    gsearch3.fit(X_train_under, y_train_under)
    print(gsearch3.best_params_, gsearch3.best_score_)
    return gsearch3, gsearch3.best_params_['min_samples_split'], gsearch3.best_params_['min_samples_leaf']

def rf_cv4(X_train_under, y_train_under, n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """Grid-search RandomForest max_features with the other params fixed.

    Bug fix: ``best_params_`` is a dict — attribute access raised
    AttributeError; a dict lookup is used now.

    NOTE(review): a KeyboardInterrupt during fit is swallowed, but the
    return line still reads ``best_params_`` which will then fail — the
    try/except is preserved as-is; confirm the intended interrupt behavior.

    :return: (fitted GridSearchCV, best max_features)
    """
    param_test4 = {'max_features': [5, 10, 15, 20]}
    gsearch4 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                                             min_samples_split=min_samples_split,
                                                             min_samples_leaf=min_samples_leaf, oob_score=True,
                                                             random_state=seed, verbose=3, n_jobs=-1),
                            param_grid=param_test4, scoring='roc_auc', cv=5)
    try:
        gsearch4.fit(X_train_under, y_train_under)
        print(gsearch4.best_params_, gsearch4.best_score_)
    except KeyboardInterrupt as e:
        pass
    return gsearch4, gsearch4.best_params_['max_features']


def gbm_cv1(X_train_under,y_train_under):
	"""Pick the LightGBM boosting-round count via lgb.cv with early stopping.

	NOTE(review): ``lgb.cv(..., early_stopping_rounds=...)`` was removed in
	LightGBM 4.x in favor of callbacks — confirm the pinned version.

	:param X_train_under: resampled training features.
	:param y_train_under: resampled training target.
	:return: best number of boosting rounds (length of the CV AUC history).
	"""
	params = {
		'boosting_type': 'gbdt',
		'objective': 'binary',
		'metric': 'auc',
		# 'device_type':'gpu',
		'learning_rate': 0.1,
		'num_leaves': 30,
		'max_depth': 5,
		'subsample': 0.8,
		'colsample_bytree': 0.8,
	}

	data_train = lgb.Dataset(X_train_under, y_train_under)

	cv_results = lgb.cv(params, data_train, num_boost_round=500, nfold=5, stratified=False, shuffle=True, metrics='auc',
						early_stopping_rounds=50, seed=0)
	print('best n_estimators:', len(cv_results['auc-mean']))
	print('best cv score:', pd.Series(cv_results['auc-mean']).max())
	return  len(cv_results['auc-mean'])

def gbm_cv2(X_train_under, y_train_under,):
    """Grid-search LightGBM max_depth and num_leaves.

    Bug fix: ``best_params_`` is a dict — the original attribute accesses
    raised AttributeError; dict lookups are used now.

    NOTE(review): relies on the module-level ``my_roc`` scorer, which is not
    defined in this file.

    :return: (fitted GridSearchCV, best max_depth, best num_leaves)
    """
    params_test1 = {'max_depth': range(4, 7, 1), 'num_leaves': range(5, 70, 10)}

    gsearch1 = GridSearchCV(
        estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1,
                                     n_estimators=500, bagging_fraction=0.8, feature_fraction=0.8),
        param_grid=params_test1, scoring=my_roc, cv=5, n_jobs=-1, verbose=3)
    gsearch1.fit(X_train_under, y_train_under)
    return gsearch1, gsearch1.best_params_['max_depth'], gsearch1.best_params_['num_leaves']

def gbm_cv3(X_train_under, y_train_under, n_estimators, max_depth, num_leaves):
    """Grid-search LightGBM max_bin and min_data_in_leaf.

    Bug fix: ``best_params_`` is a dict — the original attribute accesses
    raised AttributeError; dict lookups are used now.

    NOTE(review): a second ``def gbm_cv3`` later in this file shadows this
    definition at import time — the later one wins; consider renaming it.

    :return: (fitted GridSearchCV, best max_bin, best min_data_in_leaf)
    """
    params_test2 = {'max_bin': range(5, 106, 20), 'min_data_in_leaf': range(1, 102, 10)}

    gsearch2 = GridSearchCV(
        estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1,
                                     n_estimators=n_estimators, max_depth=max_depth, num_leaves=num_leaves,
                                     bagging_fraction=0.8, feature_fraction=0.8),
        param_grid=params_test2, scoring=my_roc, cv=5, n_jobs=-1)
    gsearch2.fit(X_train_under, y_train_under)
    return gsearch2, gsearch2.best_params_['max_bin'], gsearch2.best_params_['min_data_in_leaf']


def gbm_cv3(X_train_under, y_train_under, n_estimators, max_depth, num_leaves, max_bin, min_data_in_leaf):
    """Grid-search LightGBM sampling params: feature/bagging fraction, bagging_freq.

    Bug fixes: ``best_params_`` is a dict — attribute access raised
    AttributeError (now dict lookups); a no-op expression statement that
    merely evaluated the best estimator/params/score was removed.

    NOTE(review): this def shadows an earlier ``gbm_cv3`` with a different
    signature (and there is no ``gbm_cv4``) — likely a naming slip; renaming
    would change the public interface, so it is only flagged here.

    :return: (fitted GridSearchCV, best feature_fraction, best
        bagging_fraction, best bagging_freq)
    """
    params_test3 = {'feature_fraction': [0.6, 0.7, 0.8, 0.9],
                    'bagging_fraction': [0.6, 0.7, 0.8, 0.9],
                    'bagging_freq': range(10, 81, 10)
                    }

    gsearch3 = GridSearchCV(
        estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1,
                                     n_estimators=n_estimators, max_depth=max_depth, num_leaves=num_leaves,
                                     max_bin=max_bin, min_data_in_leaf=min_data_in_leaf),
        param_grid=params_test3, scoring=my_roc, cv=5, n_jobs=-1)
    gsearch3.fit(X_train_under, y_train_under)
    return (gsearch3, gsearch3.best_params_['feature_fraction'],
            gsearch3.best_params_['bagging_fraction'], gsearch3.best_params_['bagging_freq'])



def gbm_cv5(X_train_under, y_train_under, n_estimators, max_depth, num_leaves, max_bin, min_data_in_leaf,
            feature_fraction, bagging_fraction, bagging_freq):
    """Grid-search LightGBM L1/L2 regularization (lambda_l1, lambda_l2).

    Bug fix: ``best_params_`` is a dict — the original attribute accesses
    raised AttributeError; dict lookups are used now.

    :return: (fitted GridSearchCV, best lambda_l1, best lambda_l2)
    """
    params_test4 = {'lambda_l1': [1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9],
                    'lambda_l2': [1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9]
                    }

    gsearch4 = GridSearchCV(
        estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1,
                                     n_estimators=n_estimators, max_depth=max_depth, num_leaves=num_leaves,
                                     max_bin=max_bin, min_data_in_leaf=min_data_in_leaf,
                                     bagging_fraction=bagging_fraction, bagging_freq=bagging_freq,
                                     feature_fraction=feature_fraction),
        param_grid=params_test4, scoring=my_roc, cv=5, n_jobs=-1)
    gsearch4.fit(X_train_under, y_train_under)
    return gsearch4, gsearch4.best_params_['lambda_l1'], gsearch4.best_params_['lambda_l2']

def gbm_cv6(X_train_under, y_train_under,n_estimators,max_depth,num_leaves,max_bin,min_data_in_leaf,
			feature_fraction,bagging_fraction,bagging_freq,lambda_l1,lambda_l2):
	"""Grid-search LightGBM's minimum split gain.

	Final tuning stage: tunes min_split_gain with every previously tuned
	parameter held fixed.

	:return: (fitted GridSearchCV, best min_split_gain)
	"""
	params_test5 = {'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
	gsearch5 = GridSearchCV(
		estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1,
									 n_estimators=n_estimators, max_depth=max_depth, num_leaves=num_leaves,
									 max_bin=max_bin, min_data_in_leaf=min_data_in_leaf,
									 bagging_fraction=bagging_fraction,
									 bagging_freq=bagging_freq, feature_fraction=feature_fraction,
									 lambda_l1=lambda_l1, lambda_l2=lambda_l2),
		param_grid=params_test5, scoring=my_roc, cv=5, n_jobs=-1)
	gsearch5.fit(X_train_under, y_train_under)
	# best_params_ is a dict; the previous attribute access
	# (best_params_.min_split_gain) raised AttributeError.
	return gsearch5, gsearch5.best_params_['min_split_gain']

if __name__ == '__main__':
	df_train = load('./output/train.csv')
	df_test = load('./output/test.csv')

	# Report missing values in the training set.
	print('训练集缺失值：%d' % (df_train.isnull().sum().sum()) )
	df_train,df_test = to_int(df_train, df_test)
	# print(df_train.info())
	# print(df_test.info())

	# Count positive/negative samples.
	num_5g = df_train.is_5g.value_counts().values
	#pie1(num_5g)

	int_cols = show_vl_cnt(df_train)
	# Class-distribution plots (disabled).
	#plot_kde(df_train,df_test)

	X_train, X_test, y_train, y_test = train_test_sp(df_train, 0.2)
	X_train_y = X_train.copy()
	X_train_y['is_5g'] = y_train

	#lisan_plot(df_train,X_train)
	#lianxu_plot(df_train)

	#yichang_xiangxian(df_train,'处理前')

	# Outlier removal — presumably mutates df_train/df_test in place,
	# since the return value is discarded (TODO confirm against del_yichang).
	del_yichang(df_train,df_test)
	#yichang_xiangxian(df_train, '处理后')

	#lianxu_his_plot()

	#corr_map(X_train, '原始变量相关性热力图', plot_save=True)

	gc.collect()
	show_memory_info('')

	# Feature construction.
	gouzao(df_train)

	# Ratio of each (product_type, service_type) pair within its product_type.
	grp1 = df_train.groupby(['product_type', 'service_type'])
	dict1 = dict(df_train.groupby(['product_type']).agg('count')['user_id'])
	product_type_service_type_map = col_cnt(dict1, grp1)
	df_train['product_type_service_type'] = concat_str(df_train['product_type'], df_train['service_type'])
	df_train['product_type_service_type_ratio'] = df_train['product_type_service_type'].map(
		product_type_service_type_map)
	df_train['product_type_service_type'] = df_train['product_type_service_type'].astype(np.int8)

	# Count and ratio of each (prov_id, service_type) pair within its prov_id.
	grp1 = df_train.groupby(['prov_id','service_type'])
	dict1 = dict(df_train.groupby(['prov_id']).agg('count')['user_id'])
	prov_id_service_type_cnt_map , prov_id_service_type_ratio_map = index2_ratio(dict1,grp1)
	df_train['prov_id_service_type'] = concat_str(df_train['prov_id'],df_train['service_type'])
	df_train['prov_id_service_type_cnt'] = df_train['prov_id_service_type'].map(prov_id_service_type_cnt_map)
	df_train['prov_id_service_type_ratio'] = df_train['prov_id_service_type'].map(prov_id_service_type_ratio_map)
	df_train['prov_id_service_type_cnt'] = df_train['prov_id_service_type_cnt'].astype(np.int16)
	df_train['prov_id_service_type'] = df_train['prov_id_service_type'].astype(np.int16)

	# Count and ratio of each (prov_id, product_type) pair within its prov_id.
	grp1 = df_train.groupby(['prov_id','product_type'])
	dict1 = dict(df_train.groupby(['prov_id']).agg('count')['user_id'])
	prov_id_product_type_cnt_map , prov_id_product_type_ratio_map = index2_ratio(dict1,grp1)
	df_train['prov_id_product_type'] = concat_str(df_train['prov_id'],df_train['product_type'])
	df_train['prov_id_product_type_cnt'] = df_train['prov_id_product_type'].map(prov_id_product_type_cnt_map)
	df_train['prov_id_product_type_ratio'] = df_train['prov_id_product_type'].map(prov_id_product_type_ratio_map)
	df_train['prov_id_product_type_cnt'] = df_train['prov_id_product_type_cnt'].astype(np.int16)
	df_train['prov_id_product_type'] = df_train['prov_id_product_type'].astype(np.int16)

	#corr_map(df_train,'特征融合后变量间相关系数热力图',True)

	drop_na(df_train)
	reduce_mem_usage(df_train)
	# Use the axis keyword: the positional second argument to drop() is
	# deprecated and removed in pandas 2.0.
	x_train = df_train.drop(['is_5g', 'area_id', 'user_id'], axis=1)
	#pca_gxl(x_train,'特征构造后累计特征贡献率')

	df_train_drop = df_train.copy()
	# Drop meaningless variables.
	drop_(df_train_drop)

	# feature_importances = select_by_lgb(df_train_drop,df_train.is_5g)
	# feature_importances.sort_values('averge',ascending=False,inplace=True)
	# print(feature_importances)
	#
	# X_train, X_test, y_train, y_test = final_cols(df_train)
	# df_to_csv('X_train',X_train)
	# df_to_csv('X_test',X_test)
	# df_to_csv('y_train',y_train)
	# df_to_csv('y_test',y_test)
	#
	X_train = reduce_mem_usage(drop_na(pd.read_csv('./output/X_train.csv', index_col=False)))
	X_test = reduce_mem_usage(drop_na(pd.read_csv('./output/X_test.csv', index_col=False)))
	y_train = reduce_mem_usage(drop_na(pd.read_csv('./output/y_train.csv', index_col=False)))
	y_test = reduce_mem_usage(drop_na(pd.read_csv('./output/y_test.csv', index_col=False)))


	# Grid-search the resampling ratio.
	k_values = [5, 10, 20, 30]
	sampling_strategy = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
	print("------------------采样 ----------------------")
	caiyang1(k_values, sampling_strategy,X_train,y_train)

	#X_train_under, y_train_under = caiyang3(X_train, y_train)

	# df_to_csv('X_train_under_new', X_train_under)
	# df_to_csv('y_train_under_new', y_train_under)

	X_train_under = reduce_mem_usage(drop_na(pd.read_csv('./output/X_train_under_new.csv', index_col=False)))
	y_train_under = reduce_mem_usage(drop_na(pd.read_csv('./output/y_train_under_new.csv', index_col=False)))

	# Model training.
	# 5-fold cross-validation.
	folds = 5
	# Stratified K-fold: each fold keeps the original class ratio.
	cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

	# Custom f1 scorer — cost-sensitive, weighted by class support.
	my_f1 = make_scorer(f1_score, average='weighted')
	# Custom ROC-AUC scorer (module-level global used by the gbm_cv* helpers).
	my_roc = make_scorer(roc_auc_score, average='weighted')

	# Decision tree with default parameters.
	dt_model = DecisionTreeClassifier()
	dt_model.fit(X_train_under, y_train_under)
	print(dt_model.get_params())
	#plot_all(dt_model.predict_proba(X_test), '决策树', False)

	# Decision-tree hyper-parameter tuning (search itself disabled;
	# the saved search object is reloaded instead).
	#dt_cv = train_dtree(X_train_under,y_train_under)
	#joblib.dump(dt_cv, './model/DecisionTree.joblib')
	dt_cv = joblib.load('./model/DecisionTree.joblib')

	dTree = DecisionTreeClassifier(criterion='gini', max_depth=20,
								   class_weight='balanced',
								   min_samples_leaf=2, min_samples_split=2,
								   random_state=seed)
	dTree.fit(X_train_under, y_train_under)

	plt_importances(dTree, '决策树')
	# Predict and evaluate.
	predictions = dTree.predict(X_test)
	predictions_proba = dTree.predict_proba(X_test)
	plot_all(predictions_proba, '决策树', plot=True)


	# Random forest with default parameters -----------------------------------
	rf0 = RandomForestClassifier(oob_score=True, random_state=seed)
	rf0.fit(X_train_under, y_train_under)
	print(rf0.oob_score_)
	y_predprob = rf0.predict_proba(X_test)
	plot_all(y_predprob, '随机森林', plot=False)


	# Random-forest hyper-parameter tuning, one stage at a time.
	n_estimators = rf_cv1()

	gsearch2 ,max_depth = rf_cv2(X_train_under, y_train_under,n_estimators)
	joblib.dump(gsearch2, './model/RFC2.joblib')
	gsearch2 = joblib.load('./model/RFC2.joblib')

	gsearch3,min_samples_split,min_samples_leaf = rf_cv3(X_train_under, y_train_under,n_estimators,max_depth)
	# Fixed copy-paste bug: this stage's result is gsearch3, but gsearch2 was
	# being dumped to RFC3.joblib and then reloaded over gsearch2.
	joblib.dump(gsearch3, './model/RFC3.joblib')
	gsearch3 = joblib.load('./model/RFC3.joblib')

	gsearch4,max_features = rf_cv4(X_train_under, y_train_under,n_estimators,max_depth,min_samples_split,min_samples_leaf)
	joblib.dump(gsearch4, './model/RFC_gsearch4.joblib')
	gsearch4 = joblib.load('./model/RFC_gsearch4.joblib')

	# Final random forest with the tuned parameters.
	rf2 = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split,
								 min_samples_leaf=min_samples_leaf, max_features=max_features, oob_score=True,
								 random_state=seed)
	rf2.fit(X_train_under, y_train_under)
	print(rf2.oob_score_)
	joblib.dump(rf2, './model/rf2_fina.joblib')
	rf_model = joblib.load('./model/rf2_fina.joblib')

	plt_importances(rf_model, '随机森林')
	plot_all(rf_model.predict_proba(X_test), '随机森林', True)

	# LightGBM with default parameters -------------------------------
	lgb_model_mr = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metrics='auc')
	lgb_model_mr.fit(X_train_under, y_train_under)
	joblib.dump(lgb_model_mr, './lgb_model_mr.joblib')
	# Predict and evaluate.
	predictions_proba = lgb_model_mr.predict_proba(X_test)
	plot_all(predictions_proba, 'lgb', plot=False)

	# LightGBM hyper-parameter tuning, one stage at a time.
	n_estimators = gbm_cv1(X_train_under, y_train_under)
	gsearch1 ,max_depth ,num_leaves = gbm_cv2(X_train_under, y_train_under,n_estimators)
	joblib.dump(gsearch1, './model/lgb_model1.joblib')
	gsearch1 = joblib.load('./model/lgb_model1.joblib')

	# NOTE(review): gbm_cv3 as defined in this file requires
	# (..., max_bin, min_data_in_leaf) — this 5-argument call raises
	# TypeError. It looks like it should call the max_bin/min_data_in_leaf
	# search stage instead; confirm the intended helper and fix the name.
	gsearch2 , max_bin ,min_data_in_leaf = gbm_cv3(X_train_under, y_train_under,n_estimators,max_depth ,num_leaves )
	joblib.dump(gsearch2, './model/lgb_model2.joblib')
	gsearch2 = joblib.load('./model/lgb_model2.joblib')


	gsearch3, feature_fraction ,bagging_fraction ,bagging_freq = \
			gbm_cv3(X_train_under, y_train_under, n_estimators, max_depth, num_leaves, max_bin, min_data_in_leaf)
	joblib.dump(gsearch3,'./model/lgb_model3.joblib')
	gsearch3 = joblib.load('./model/lgb_model3.joblib')

	gsearch4, lambda_l1, lambda_l2 = \
			gbm_cv5(X_train_under, y_train_under,n_estimators,max_depth,num_leaves,max_bin,min_data_in_leaf,feature_fraction,bagging_fraction,bagging_freq)
	joblib.dump(gsearch4,'./model/lgb_model4.joblib')
	gsearch4 = joblib.load('./model/lgb_model4.joblib')

	gsearch5, min_split_gain \
	 = gbm_cv6(X_train_under, y_train_under, n_estimators, max_depth, num_leaves, max_bin, min_data_in_leaf,
					feature_fraction, bagging_fraction, bagging_freq, lambda_l1, lambda_l2)
	joblib.dump(gsearch5, './model/lgb_model5.joblib')
	gsearch5 = joblib.load('./model/lgb_model5.joblib')

	# Final stage: lower the learning rate, keep the tuned parameters,
	# and validate the model.
	lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.01,
								   n_estimators=n_estimators, max_depth=max_depth, num_leaves=num_leaves,
								   max_bin=max_bin, min_data_in_leaf=min_data_in_leaf,
								   bagging_fraction=bagging_fraction, bagging_freq=bagging_freq,
								   feature_fraction=feature_fraction,
								   lambda_l1=lambda_l1, lambda_l2=lambda_l2, min_split_gain=min_split_gain)

	lgb_model.fit(X_train_under, y_train_under)
	joblib.dump(lgb_model, './model/lgb_model.joblib')
	lgb_model = joblib.load('./model/lgb_model.joblib')
	# Predict and evaluate the final model.
	predictions_proba = lgb_model.predict_proba(X_test)
	plot_all(predictions_proba,'lgb',plot = True)
	plt_importances(lgb_model, 'Lightgbm')